
/*
* create file with indicator of whether zip code has above/below median
* exposure to covid 
*/

* crosswalk zip to county
import excel "${data_raw}/HUD-zip-tract/ZIP_COUNTY_032020.xlsx", clear firstrow
rename *, lower

* assign zips to county that most of the area falls into
bys zip (res_ratio) : gen n = _n
bys zip : egen max_n = max(n)
keep if n == max_n

* clean up 
keep zip county
ren county fips
ren zip zcta
destring *, replace force
tempfile cw_cty
save `cw_cty'

* SCI weighted cases
use "${data_derived_exposure}/sci_weighted_cases_short.dta", clear

* create log of SCI weighted cases
* also create logs of SCI weighted covariates (i.e. income, density etc)
foreach var of varlist sci_c_11-sci_pc_frac_urban_2010 {
	gen l_`var' = log(`var')
}

* rename variables
ren *med_hh_inc_2018* *inc* 
ren *pop_density_2018* *pop_dens*
ren *frac_urban_2010* *frc_urban*

* merge on population counts to be able to weight by population
* there will be a few places are only in the master file that don't get matched.
* These will all be in the Virgin islands or in American Samoa. We can drop them.
destring zcta, replace force
merge 1:1 zcta using "${data_derived_covs}/zip_covariates.dta",  ///
	 keep(3) keepusing(pop) nogen
	
* merge on 
merge 1:1 zcta using `cw_cty', assert(1 2 3) keep(3) nogen
	
* find median and other percentiles within counties
foreach i of numlist 5(5)95 {
	foreach var of varlist l_sci_* {
		qui: gen p`i'_`var' = `var'
	}
}

* create indicators for being above/below median
* both at county and at national level
* also create ventiles within county and nationally
* also create percentiles nationally (we can't do that within county because
* there are typically not enough zips in a county)
preserve
	collapse (p5) p5_* (p10) p10_* (p15) p15_* (p20) p20_* (p25) p25_* ///
		(p30) p30_* (p35) p35_* (p40) p40_* (p45) p45_* ///
		(p50) p50_* (p55) p55_* (p60) p60_* (p65) p65_* (p70) p70_* ///
		(p75) p75_* (p80) p80_* (p85) p85_* (p90) p90_* (p95) p95_* [w=pop], by(fips)
	tempfile median_within_cty
	save `median_within_cty'
restore 

drop p*_sci*
merge m:1 fips using `median_within_cty', nogen assert(3) keepusing(p*)

foreach suff in c_11 c_13 c_15 c_17 c_19 ///
	pc_c_11 pc_c_13 pc_c_15 pc_c_17 pc_c_19 ///
	pc_c_11_pc pc_c_13_pc pc_c_15_pc pc_c_17_pc pc_c_19_pc ///
	inc pop_dens frc_urban pc_inc pc_pop_dens pc_frc_urban {
	gen abv_md_cty_`suff' = l_sci_`suff' >= p50_l_sci_`suff' & ///
		l_sci_`suff' !=. 
	qui: xtile abv_md_nat_`suff' = l_sci_`suff' [w=pop], nq(2) // national
	qui: replace abv_md_nat_`suff' = abv_md_nat_`suff'-1
	
	* ventiles
	qui: gen vent_cty_`suff' = . 
	qui: replace vent_cty_`suff' = 1 if l_sci_`suff' < p5_l_sci_`suff'
	forval i = 10(5)95 {
		local vent =`i'/5
		local i_min_5 = `i'-5
		qui: replace vent_cty_`suff' = `vent' if l_sci_`suff' >= p`i_min_5'_l_sci_`suff' & ///
			l_sci_`suff' <= p`i'_l_sci_`suff' & l_sci_`suff' !=. 
	qui: replace vent_cty_`suff' = 20 if l_sci_`suff' > p95_l_sci_`suff' & ///
		l_sci_`suff' !=.
	}
	* ventiles nationally
	qui: xtile vent_nat_`suff' = l_sci_`suff' [w=pop], nq(20)
	
	* percentiles nationally
	qui: xtile perc_nat_`suff' = l_sci_`suff' [w=pop], nq(100)
}

* now focus on indicators and delete raw variables
keep zcta pop fips abv_md_* vent_* perc_* l_sci_*

* save
save "${data_derived_exposure}/sci_weighted_cases_rank", replace

